# This script calculates the newborn MRS using five different calculations reported in the literature.
# The area under the receiver operating characteristic curve was calculated for each score in order to identify the best calculation to select.
# Scores were calculated using R, version 3.6.1

# Set working directory
# NOTE(review): "/../../.." looks like a placeholder path - presumably it should point at the
# directory holding the input files read below; confirm before running.
# All input and output file names in this script are relative, so they resolve against this directory.
setwd("/../../..")

# Load packages
library(pROC)
library(dplyr)

###############
### Score 1 ###
###############
# Load methylation data in the IOWBC for the CpGs selected from RFE feature selection.
# Format: rows = samples, columns = CpGs. The code below takes columns 2:7 as the CpGs
# and uses the Asthma_10YR column as the outcome.
data <- read.csv("Beta_QN_autosome_combat_newborn_EWAS_6_CpGs_EPIC_862GU_asthma_747ID.csv", header = TRUE)
# Drop the X column (presumably the row-name column written by write.csv - confirm)
data$X <- NULL

# Get CpG effect sizes. Data found in Genomic_risk_score_features.xlsx, sheet CpGs included in newborn MRS.
# The table needs a CpG column (identifiers) and an OR_edited_for_score column (weights).
# Weights are looked up by CpG name below, so the row order does not have to match the column order.
effect_sizes <- read.table("Newborn_MRS_CpG_effect_sizes.txt", header = TRUE)

# Calculate MRS = sum(normalised CpG beta X CpG effect size)
# data1 is the untouched copy that the later score sections start from.
data1 <- data
MRS1 <- data1

cpgs <- colnames(MRS1)[2:7]

# Look up each CpG's weight by name and stop with a clear message if any CpG has no effect size,
# instead of failing later with a zero-length replacement error.
weight_rows <- match(cpgs, effect_sizes$CpG)
if (anyNA(weight_rows)) {
	stop("No effect size found for CpG(s): ", paste(cpgs[is.na(weight_rows)], collapse = ", "), call. = FALSE)
}
cpg_weights <- setNames(effect_sizes$OR_edited_for_score[weight_rows], cpgs)

for (i in cpgs) {
	MRS1[, i] <- MRS1[, i] * cpg_weights[[i]]
}
MRS1$score <- rowSums(MRS1[, cpgs])

# Logistic regression of Asthma_10YR on the score; the ROC curve is built from the fitted probabilities.
reg <- glm(Asthma_10YR ~ score, family = binomial, data = MRS1)
roc_data <- roc(reg$y, reg$fitted.values, ci = TRUE)
# Recorded result:
#Area under the curve: 0.5497
#95% CI: 0.4956-0.6039 (DeLong)

write.csv(MRS1, "Newborn_MRS_Score1_data.csv", row.names = FALSE)

###############
### Score 2 ###
###############
# Calculate MRS = number of CpGs with an "at risk" methylation level, where the thresholds come from the controls:
#   - hypermethylated CpGs (effect size > 1) count when the level is above the upper quartile of the controls
#   - all other CpGs (treated as hypomethylated) count when the level is below the lower quartile of the controls

MRS2 <- data1

# Subset controls from dataset
controls <- subset(data1, Asthma_10YR == 0)

# Identify CpGs which are hyper methylated
hypermethylated <- effect_sizes$CpG[which(effect_sizes$OR_edited_for_score > 1)]

# Replace each CpG column with a 0/1 indicator and print the threshold that was used
for (i in cpgs) {
	if (i %in% hypermethylated) {
		threshold <- quantile(controls[, i], 0.75)
		MRS2[, i] <- ifelse(MRS2[, i] > threshold, 1, 0)
	} else {
		threshold <- quantile(controls[, i], 0.25)
		MRS2[, i] <- ifelse(MRS2[, i] < threshold, 1, 0)
	}
	print(i)
	print(threshold)
}

MRS2$score <- rowSums(MRS2[, cpgs])

# Logistic regression of Asthma_10YR on the score; the ROC curve is built from the fitted probabilities.
reg <- glm(Asthma_10YR ~ score, family = binomial, data = MRS2)
roc_data <- roc(reg$y, reg$fitted.values, ci = TRUE)
# Recorded result:
#Area under the curve: 0.5388
#95% CI: 0.4848-0.5929 (DeLong)

# quantile thresholds:
	# "cg13289553" -	75% 0.04552853
	# "ch.6.1218502R" -	75% 0.03839596
	# "cg13427149" - 	75%	0.03014997
	# "cg17333211" -    75% 0.04020226
	# "cg02331902" - 	75%	0.02820954
	# "cg07156990" -    25% 0.849281

# Written to the Score2 file: the previous name ("...Score3_data.csv") collided with the
# Score 3 output, which overwrote this data.
write.csv(MRS2, "Newborn_MRS_Score2_data.csv", row.names = FALSE)

###############
### Score 3 ###
###############
# Calculate MRS = (1 / number of CpGs) * sum(weight * (beta value - mean of controls) / sd of controls)
# weights = +1 for hypermethylated CpGs, -1 for all others (treated as hypomethylated)

MRS3 <- data1

# Controls provide the reference mean and sd for every CpG
controls <- subset(data1, Asthma_10YR == 0)

# CpGs with an effect size above 1 are the hypermethylated ones
hypermethylated <- effect_sizes$CpG[which(effect_sizes$OR_edited_for_score > 1)]

# Direction of effect per CpG, named so it can be looked up inside the loop
direction <- ifelse(cpgs %in% hypermethylated, 1, -1)
names(direction) <- cpgs

# Replace each CpG column with its signed z-score relative to the controls
for (cpg in cpgs) {
	centred <- MRS3[, cpg] - mean(controls[, cpg])
	MRS3[, cpg] <- (centred / sd(controls[, cpg])) * direction[[cpg]]
}

MRS3$score <- rowSums(MRS3[, cpgs]) / length(cpgs)

# Logistic regression of Asthma_10YR on the score; the ROC curve is built from the fitted probabilities.
reg <- glm(Asthma_10YR ~ score, family = binomial, data = MRS3)
roc_data <- roc(reg$y, reg$fitted.values, ci = TRUE)
# Recorded result:
#Area under the curve: 0.4927
#95% CI: 0.436-0.5494 (DeLong)

write.csv(MRS3, "Newborn_MRS_Score3_data.csv", row.names = FALSE)

###############
### Score 4 ###
###############
# Calculate MRS = (1 / number of CpGs) * sum(weight * (beta value - mean of controls) / sd of controls)
# weights = the meta-analysis effect size (OR_edited_for_score)

MRS4 <- data1

# Subset controls from dataset (derived here so this section does not depend on an earlier one having run)
controls <- subset(data1, Asthma_10YR == 0)

# The same weight lookup applies to every CpG, so no hyper/hypomethylated branch is needed.
# NOTE(review): unlike Score 3, the z-score of hypomethylated CpGs is not sign-flipped here;
# the previous code branched on hypermethylation but both branches were identical - confirm this is intended.
for (i in cpgs) {
	control_mean <- mean(controls[, i])
	control_sd <- sd(controls[, i])
	weight <- effect_sizes$OR_edited_for_score[which(effect_sizes$CpG == i)]
	MRS4[, i] <- ((MRS4[, i] - control_mean) / control_sd) * weight
}

MRS4$score <- rowSums(MRS4[, cpgs]) / length(cpgs)

# Logistic regression of Asthma_10YR on the score; the ROC curve is built from the fitted probabilities.
reg <- glm(Asthma_10YR ~ score, family = binomial, data = MRS4)
roc_data <- roc(reg$y, reg$fitted.values, ci = TRUE)
# Recorded result:
#Area under the curve: 0.5321
#95% CI: 0.4773-0.5869 (DeLong)

write.csv(MRS4, "Newborn_MRS_Score4_data.csv", row.names = FALSE)

###############
### Score 5 ###
###############
# Calculate: Sum((effect size / average effect size of all CpGs) *
#			((beta value - median methylation value for controls) if association = increased methylation
#			OR (median methylation for controls - beta value) if association = decreased methylation))
# Modification from original proposed score - median methylation from controls taken from this study, not a previously reported one due to data availability

MRS5 <- data1

# Subset controls from dataset
controls <- subset(data1, Asthma_10YR == 0)

# Identify CpGs which are hyper methylated
hypermethylated <- effect_sizes$CpG[which(effect_sizes$OR_edited_for_score > 1)]

# Average effect size across all rows of effect_sizes; each CpG weight is expressed relative to it
average_weight <- sum(effect_sizes$OR_edited_for_score) / length(effect_sizes$OR_edited_for_score)

# Replace each CpG column with its weighted distance from the control median,
# oriented so that a deviation in the direction of the association is positive
for (i in cpgs) {
	control_median <- median(controls[, i])
	weight <- effect_sizes$OR_edited_for_score[which(effect_sizes$CpG == i)] / average_weight
	if (i %in% hypermethylated) {
		MRS5[, i] <- (MRS5[, i] - control_median) * weight
	} else {
		MRS5[, i] <- (control_median - MRS5[, i]) * weight
	}
}

# The previous version of this line had an unbalanced parenthesis, which stopped the script from parsing
MRS5$score <- rowSums(MRS5[, cpgs])

# Logistic regression of Asthma_10YR on the score; the ROC curve is built from the fitted probabilities.
reg <- glm(Asthma_10YR ~ score, family = binomial, data = MRS5)
roc_data <- roc(reg$y, reg$fitted.values, ci = TRUE)
# Recorded result:
#Area under the curve: 0.523
#95% CI: 0.4667-0.5792 (DeLong)

write.csv(MRS5, "Newborn_MRS_Score5_data.csv", row.names = FALSE)

